library(haven)
library(tictoc)
library(lfe)
library(fixest)
library(data.table)
library(rqdatatable)
library(dplyr)
library(ggplot2)
library(fst)
library(stringr)
library(magrittr)
library(arrow)
library(broom)
library(igraph)

###############
# Directories #
###############

sasrep <- "./data/sasrep/"
fstrep <- "./data/fstrep/"
saverep <- "./saverep/"
clusterrep <- "./data/clusterrep/"
estimrep <- "./estimrep"
nomenclatures <- "./data/nomenclatures/"

sasfilename <- list()
for (i in (1:19)){
  sasfilename[i] <- paste(sasrep, "salarp", 2001+i, ".sas7bdat", sep = "")
}


#############
# Functions #
#############

# Data preparation and selection
#################################

dataprep <- function(d){
  tic("dataprep")
  d <- d[,.(year, NBHEUR, DUREE, AGE, ident, firm, lnhwage, DEPT, REGT, SEXE, 
            occ, ind, est, nb_workers, lnwage, CATJUR, DOMEMPL)]
  d <- d[NBHEUR>100,]
  d <- d[DUREE>=90,]
  
  d[, "age2" := AGE^2]
 
  d <- na.omit(d, cols=c("AGE", "age2", "ident", "firm", "year", "lnhwage", "SEXE"))
  
  d[, hyearmean := mean(lnhwage, na.rm=TRUE), by = year]
  d[, loghourly := lnhwage - hyearmean]
  
  d[, yearlyfirmsize := uniqueN(ident), by=firm]
  d[, CJ := as.numeric(substr(CATJUR, 1, 1))] 
  
  names(d)[names(d)=="ident"] <- "indiv_id"
  names(d)[names(d)=="firm"] <- "firm_id"
  
  toc()
  
  d
}

fst_translate <- function(year){
  tic("fst_translate")
  print(paste("loading DADS year", year))
  d <- as.data.table(read_sas(sasfilename[[year-2002+1]]))
  d <- dataprep(d)
  print("save prepared dataframe")
  write.fst(d, paste(fstrep, year, ".fst", sep=""))
  toc()
}


# A few descriptive stats
firmdec <- function(d){
  tic("between-within firm decomposition")
  
  meany <- d[, list(mean(lnhwage, na.rm=TRUE)), by = "year"]
  vary <- d[, list(var(loghourly, na.rm=TRUE)), by = "year"]
  decompo_firm <- merge(meany, vary, by="year")
  
  d[, "my":= mean(loghourly),by= c("firm_id", "year")]
  d[, "diff_y" := loghourly - my]
  
  # Between-Within decomposition
  between <- d[,var(my, use="complete.obs"), by = "year"]
  decompo_firm <- merge(decompo_firm, between, by="year")
  within <- d[,var(diff_y, use="complete.obs"), by = "year"]
  decompo_firm <- merge(decompo_firm, within, by="year")
  names(decompo_firm)<-c("year","mean_logwage","vary","between","within")
  
  # All period values
  m <- d[, mean(lnhwage, na.rm=TRUE)]
  v <- d[, var(loghourly, na.rm=TRUE)]
  d[, "my":= mean(loghourly),by= c("firm_id")]
  d[, "diff_y" := loghourly - my]
  b <- d[, var(my, use="complete.obs")]
  w <- d[, var(diff_y, use="complete.obs")]
  
  decompo_firm <- rbind(c("period",m,v,b,w), data.frame(decompo_firm))
  
  toc()
  decompo_firm
}

firmquant <- function(d){
  tic("quantiles of firm size")
  quant <- d[, quantile(yearlyfirmsize, na.rm = TRUE, probs = seq(0, 1, 1/10)), by="year"]
  
  toc()
  quant
}

nunits <- function(d){
  nindiv <- d[,length(unique(indiv_id))]  # n. of individuals (unique indiv ident)
  nfirm <- d[,length(unique(firm_id))]    # n. of firms
  nobs <- nrow(d)                         # n. of observations
  d[, nbfirms := (length(unique(firm_id))), by = indiv_id]
                                          # n. of unique firms by individual
  d[, mover := nbfirms>1]
  nmover <- d[mover==1,length(unique(indiv_id))] # n. of movers
  res <- c(nobs,nindiv,nfirm,nmover)
  res
}

statdes <- function(d){
  tic("descriptive stats")
  yearsex <- setDT(d)[, .(`count` = .N), by = c("year","SEXE")]
  departements <- d[, .(`count` = .N), by = c("year","DEPT")]
  departements <- reshape(departements, idvar = "DEPT", timevar = "year", direction = "wide")
  regions <- d[, .(`count` = .N), by = c("year","REGT")]
  regions <- reshape(regions, idvar = "REGT", timevar = "year", direction = "wide")
  n_units <- nunits(d)
  quant <- firmquant(d)
  between <- firmdec(d)
  
  stat_list <- list("sex"=yearsex,"dep"=departements,"reg"=regions,
                    "size"=quant,"between"=between,"obs"=n_units)
  
  toc()
  stat_list
}

detailed_statdes <- function(d){
  tic("detailed descriptive stats")
  sex <- d[, .(`count` = .N), by = c("SEXE")][order(SEXE)]
  names(sex) <- c("value","nobs")
  sex$variable <- "Sex"
  departements <- d[, .(`count` = .N), by = c("DEPT")][order(DEPT)]
  names(departements) <- c("value","nobs")
  departements$variable <- "Departement"
  regions <- d[, .(`count` = .N), by = c("REGT")][order(REGT)]
  names(regions) <- c("value","nobs")
  regions$variable <- "Region"
  industries <- d[, .(`count` = .N), by = c("ind")][order(ind)]
  names(industries) <- c("value","nobs")
  industries$variable <- "Industry"
  occupations <- d[, .(`count` = .N), by = c("occ")][order(occ)]
  names(occupations) <- c("value","nobs")
  occupations$variable <- "Occupation"
  sizecat <- d[,firmsize_cat:=cut(firmsize, breaks=c(0,20,200,1000,Inf))][, .(`count` = .N), by = c("firmsize_cat")][order(firmsize_cat)]
  names(sizecat) <- c("value","nobs")
  sizecat$variable <- "Firmsize group"
  
  statdes <- rbind(sex, departements, regions, industries, occupations, sizecat)

  toc()
  
  return(statdes)
}

# AKM estimation with or without split sampling
###############################################

# random split from random permutation before split
fast_split <- function(d, split_type="unbalanced"){
  print("starting sample splitting")
  print(paste("method :", split_type))
  tic("splitting")
  d <- setDT(d)

# Splitting observations, balanced by indiv ("period split")
  #For each individual, observations are split in the two samples as equally as possible
  if (split_type=="periods"){
    n <- nrow(d)
    rows <- sample(n)
    d <- d[rows,]
    d <- d[order(indiv_id),]
    d$split <- rep(c(0,1),length.out=n)
    toc()
    return(d)
  }
  
  if (split_type=="consec_periods"){
    d <- d[order(year)]
    d <- d[order(indiv_id)]
    d[,r:=seq_len(.N), by=indiv_id]
    d[,c("limit","order"):=list(sample(1:(.N-1),1),sample(c(0,1),1)), by=indiv_id]
    d[,split:=((r<=limit)*(1-order)+(r>limit)*(order)), by=indiv_id]
    d[,c("r","limit","order"):=NULL]
    toc()
    return(d)
  }
  
# Splitting indiv ("firm split")
  if (split_type=="unbalanced"){
    indiv <- unique(d[,.(indiv_id)])
    n <- nrow(indiv)
    rows <- sample(n)
    indiv <- indiv[rows,]
    print(paste("Number of unique individuals before split :", n))
    indiv$split <- rep(c(0,1),length.out=n)
    d <- merge(d, indiv, by="indiv_id", all.x=TRUE)
    toc()
    return(d)
  }
  
  if (split_type=="by_movers"){
    d[, mover := (uniqueN(firm_id)>1), by = indiv_id]
    indiv <- unique(d[,.(indiv_id, mover)])
    n <- nrow(indiv)
    rows <- sample(n)
    indiv <- indiv[rows,]
    indiv <- indiv[order(mover)]
    print(paste("Number of unique individuals before split :", n))
    indiv$split <- rep(c(0,1),length.out=n)
    d <- merge(d, indiv[.(indiv_id,split)], by="indiv_id", all.x=TRUE)
    toc()
    return(d)
  }
  
  if (split_type=="by_firm"){
    d$split <- NA
    years <- unique(d$year) #l'annee ou un individu apparait pour la 1ere fois on le classe dans un split
    for (y in sample(years)){
      indiv <- unique(d[(year==y)&(is.na(split)),.(indiv_id,firm_id)], by="indiv_id")
      n <- nrow(indiv)
      rows <- sample(n)
      indiv <- indiv[rows,]
      indiv <- indiv[order(firm_id)]
      print(paste("Year", y, " number of unique individuals before split :", n))
      indiv$split <- rep(c(0,1),length.out=n)
      d[indiv, split := i.split, on = .(indiv_id)]
      d[,split := as.numeric(split)]
    }
    toc()
    return(d)
  }
  
  if (split_type=="by_firm_movers_alt"){
    tic("mover compute")
    d[, nbfirms := (length(unique(firm_id))), by = indiv_id]
    d[, mover := nbfirms>1]
    d$split <- NA
    toc()
    tic("non-movers split")
    di <- d[mover==F]
    indiv <- unique(di[,.(indiv_id,firm_id)])
    n <- nrow(indiv)
    rows <- sample(n)
    indiv <- indiv[rows,]
    indiv <- indiv[order(firm_id)]
    print(paste("Number of non-mover individuals before split :", n))
    indiv$split <- rep(c(0,1),length.out=n)
    di[indiv, split := i.split, on = .(indiv_id)]
    toc()
    
    tic("movers split")
    dm <- d[mover==T]
    years <- unique(dm$year) #l'annee ou un individu apparait pour la 1ere fois on le classe dans un split
    for (y in sample(years)){
      indivm <- unique(dm[(year==y)&(is.na(split)),.(indiv_id,firm_id)], by="indiv_id")
      n <- nrow(indivm)
      rows <- sample(n)
      indivm <- indivm[rows,]
      indivm <- indivm[order(firm_id)]
      print(paste("Year", y, " number of unique movers before split :", n))
      indivm$split <- rep(c(0,1),length.out=n)
      dm[indivm, split := i.split, on = .(indiv_id)]
    }
    toc()
    tic("binding")
    d <- rbind(di, dm)
    d[,split := as.numeric(split)]
    rm(di)
    rm(dm)
    toc()
    gc()
    toc()
    return(d)
  }
  
  if (split_type=="by_firm_indiv"){
    d$split <- NA
    findiv <- unique(d[,.(indiv_id,firm_id)], by=c("indiv_id","firm_id"))
    n <- nrow(findiv)
    rows <- sample(n)
    findiv <- findiv[rows,]
    findiv <- findiv[order(firm_id)]
    print(paste("number of unique individual-firm couples before split :", n))
    findiv$split <- rep(c(0,1),length.out=n)
    d[findiv, split := i.split, on = .(indiv_id, firm_id)]
    d[,split := as.numeric(split)]
    toc()
    return(d)
  }
  
  if (split_type=="by_firm_movers_old"){
    d[, nbfirms := (length(unique(firm_id))), by = indiv_id]
    d[, mover := nbfirms>1]
    d$split <- NA
    years <- unique(d$year) #l'annee ou un individu apparait pour la 1ere fois on le classe dans un split
    for (y in sample(years)){
      indiv <- unique(d[(year==y)&(is.na(split)),.(indiv_id,firm_id,mover)], by = "indiv_id")
      n <- nrow(indiv)
      rows <- sample(n)
      indiv <- indiv[rows,]
      indiv <- indiv[order(firm_id,mover)]
      print(paste("Year", y, " number of unique individuals before split :", n))
      indiv$split <- rep(c(0,1),length.out=n)
      d[indiv, split := i.split, on = .(indiv_id)]
      d[,split := as.numeric(split)]
    }
    toc()
    return(d)
  }
  
  if (split_type=="by_firm_movers"){
    n <- nrow(d)
    rows <- sample(n)
    d <- d[rows,]
    findiv <- unique(d[,.(indiv_id,firm_id)], by=c("indiv_id","firm_id"))
    findiv[, nbfirms := .N, by=indiv_id]
    findiv <- findiv[order(nbfirms)]
    indiv <- unique(findiv, by=c("indiv_id"))
    indiv <- indiv[order(firm_id)]
    n <- nrow(indiv)
    indiv$split <- rep(c(0,1),length.out=n)
    d[indiv, split := i.split, on = .(indiv_id)]
    d[,split := as.numeric(split)]
    toc()
    return(d)
  }
}

akm_est <- function(d, method="yearly mean"){
  tic("estimation akm")
  d <- d[d$comp==1,]
  if (method=="yearly mean"){
    print("starting AKM estimation, log wages averaged per year")
    new_est <- feols(loghourly ~ AGE+age2|indiv_id+firm_id, data=d, fixef.rm="none")
    print(summary(new_est))
  }
  if (method=="Card"){
    print("starting AKM estimation, year fixed effect and age effect flat at 40")
    new_est <- feols(lnhwage ~ age_2+age_3|indiv_id+firm_id+year, data=d, fixef.rm="none")
    print(summary(new_est))
    
  }
  toc()
  new_est
}



merge_fe <- function(d, est, wfe_name="wfe", ffe_name="ffe", yfe_name="yfe", method="yearly mean"){
  tic("Merge FE")
  
  fe<-fixef(est)
  
  wfe <- data.frame(fe[1])
  names(wfe) <- wfe_name
  wfe$indiv_id <- rownames(wfe)
  
  ffe <- data.frame(fe[2])
  names(ffe) <- ffe_name
  ffe$firm_id <- rownames(ffe)
  
  ffe <- setDT(ffe)
  wfe <- setDT(wfe)
  
  d <- merge(d, wfe, by="indiv_id", all.x=T)
  d <- merge(d, ffe, by="firm_id", all.x=T)
  
  if (method=="Card"){
    yfe <- data.frame(fe[3])
    names(yfe) <- yfe_name
    yfe$year <- as.numeric(rownames(yfe))
    yfe <- setDT(yfe)
    d <- merge(d, yfe, by="year", all.x=T)
  }
  
  toc()
  
  d
}

get_u <- function(d, akm_estimate, method="yearly mean"){
  d <- setDT(d)[d$comp==1, "u" := resid(akm_estimate, na.rm=FALSE)]
  d[d$comp==1, "fitted" := fitted(akm_estimate, na.rm=FALSE)]
  if (method=="yearly mean"){
    d[, "Xb" := akm_estimate$coefficients[1]*AGE + 
        akm_estimate$coefficients[2]*age2]
  }
  if (method=="Card"){
    d[, "Xb" := akm_estimate$coefficients[1]*age_2 +
        akm_estimate$coefficients[2]*age_3]
  }
  d
}

split_est <- function(d, method="yearly mean", split_type="unbalanced"){
  tic("split estimations")
  d <- fast_split(d, split_type=split_type)
  
  d1 <- d[d$split==1,]
  tic("compfactor")
  d1$comp <- compfactor(list(factor(d1$indiv_id),factor(d1$firm_id)))
  toc()
  est1 <- akm_est(d1, method=method)
  d[d$split==1, "comp1" := d1$comp]
  d <- merge_fe(d, est1, "wfe1", "ffe1", "yfe1", method=method)
  d[(d$comp1==1)&(!is.na(d$wfe1)), "u1" := resid(est1)]
  d[(d$comp1==1)&(!is.na(d$wfe1)), "fitted1" := fitted(est1)]
  if (method=="yearly mean"){
    d[, "Xb1" := est1$coefficients[1]*AGE + est1$coefficients[2]*age2]
  }
  if (method=="Card"){
    d[, "Xb1" := est1$coefficients[1]*age_2 + est1$coefficients[2]*age_3]
  }
  
  
  d0 <- d[d$split==0,]
  tic("compfactor")  
  d0$comp <- compfactor(list(factor(d0$indiv_id),factor(d0$firm_id)))
  toc()
  est0 <- akm_est(d0, method=method)
  d[d$split==0, "comp0" := d0$comp]
  d <- merge_fe(d, est0, "wfe0", "ffe0", "yfe0", method=method)
  d[(d$comp0==1)&(!is.na(d$wfe0)), "u0" := resid(est0)]
  d[(d$comp0==1)&(!is.na(d$wfe0)), "fitted0" := fitted(est0)]
  
  if (method=="yearly mean"){
    d[, "Xb0" := est0$coefficients[1]*AGE + est0$coefficients[2]*age2]
    d[, "fX0" := Xb0+ffe0]
    d[, "fX1" := Xb1+ffe1]
    d[!is.na(d$wfe1), "IR0" := loghourly-fX0] #Individual residual with FFE and XB estimated on other split
    d[!is.na(d$wfe0), "IR1" := loghourly-fX1]
    d[(d$comp1==1)&(is.na(d$wfe1)), "r0" := loghourly-fX0-wfe0] # obs residual with prediction estimated from other split
    d[(d$comp0==1)&(is.na(d$wfe0)), "r1" := loghourly-fX1-wfe1]
  }
  if (method=="Card"){
    d[, "Xb0" := est0$coefficients[1]*age_2 + est0$coefficients[2]*age_3]
    d[, "fX0" := Xb0+ffe0+yfe0]
    d[, "fX1" := Xb1+ffe1+yfe1]
    d[!is.na(d$wfe1), "IR0" := lnhwage-fX0]
    d[!is.na(d$wfe0), "IR1" := lnhwage-fX1]
    d[(d$comp1==1)&(is.na(d$wfe1)), "r0" := lnhwage-fX0-wfe0]
    d[(d$comp0==1)&(is.na(d$wfe0)), "r1" := lnhwage-fX1-wfe1]
  }
  
 
  result <- list("data" = d,
                 "estimation1" = est1,
                 "estimation0" = est0)
  toc()
  
  result
}

all_est <- function(d, method="yearly mean"){
  tic("full sample estimation")
  tic("compfactor")
  d$comp <- compfactor(list(factor(d$indiv_id),factor(d$firm_id)))
  toc()
  est <- akm_est(d, method=method)
  d <- merge_fe(d, est, "wfe_all", "ffe_all", "yfe_all", method=method)
  d <- get_u(d, est, method=method)
  
  result <- list("data" = d, "estimation" = est)
  toc()
  
  result
}


# Variance decomposition without and with split sampling
########################################################
getvar <- function(d, var1, var2=""){
  if(var2!=""){
    du <- na.omit(d[,.SD,.SDcols = c(var1,var2)])
    r <- cov(du)[1,2]
    n <- nrow(du)
  }
  if(var2==""){
    du <- na.omit(d[,.SD,.SDcols = c(var1)])
    r <- var(du)
    n <- nrow(du)
  }
  
  c(r,n)
}

crossvar <- function(d, var1, var2, var3, var4){
  ra <- getvar(d, var1,var2)
  rb <- getvar(d, var3,var4)
  c((ra[1]+rb[1])/2, (ra[2]+rb[2]))
}

crosscov <- function(d, var1, var2, var3, var4){
  ra <- getvar(d, var1,var2)
  rb <- getvar(d, var3,var4)
  ra+rb
}

split_fast_decompo <- function(d, y_name="loghourly", method="yearly mean", 
                               split_method="none", approx_u=FALSE){
  tic("variance decomposition")
  names(d)[names(d)==y_name] <- "wagevar"
  
  r <- list()
  r$startyear <- c(min(d$year),NA)
  r$endyear <- c(max(d$year), NA)
  
  
  if(split_method=="none"){
    if((method=="yearly mean")|(r$startyear[1]==r$endyear[1])){
      d$yfe1 <- 0
      d$yfe0 <- 0
      d <- d[,c("wfe0","ffe0","Xb0","u0","r0"):=list(wfe_all,ffe_all,Xb,u,u)]
      d <- d[,c("wfe1","ffe1","Xb1","u1","r1"):=list(wfe_all,ffe_all,Xb,u,u)]
    }else{
      d <- d[,c("wfe0","ffe0","Xb0","u0","r0","yfe0"):=list(wfe_all,ffe_all,Xb,u,u,yfe_all)]
      d <- d[,c("wfe1","ffe1","Xb1","u1","r1","yfe1"):=list(wfe_all,ffe_all,Xb,u,u,yfe_all)]
    }
  }
  
  r$estimethod <- NA
  n_units <- nunits(d)
  r$nindiv <- c(n_units[2],n_units[1])
  r$nfirm <- c(n_units[3],n_units[1])
  r$nmover <- c(n_units[4],n_units[1])
  if(method=="yearly mean"){
    r$mean_lwage <- d[, mean(lnhwage, na.rm=TRUE)] 
  }else{
    r$mean_lwage <- d[, mean(wagevar, na.rm=TRUE)]
  }
  
  d[, c("mwage","ffe0","ffe1"):= list(mean(wagevar, na.rm=TRUE),
                                      mean(ffe0, na.rm=TRUE),
                                      mean(ffe1, na.rm=TRUE)), by=firm_id]
  d$diff_wage <- d$wagevar - d$mwage
  
  # Between-Within decomposition
  r$vary <- getvar(d, "wagevar")
  r$between <- getvar(d, "mwage")
  r$within <- getvar(d, "diff_wage")
  
  d[,c("varwithin") := list(var(wagevar, na.rm=TRUE)),
    by = firm_id] #intra-firm variance estimated with correction for sample size = nobs per firm
  r$sample_within <- c(mean(d$varwithin, na.rm=TRUE), length(d$varwithin))
  
  # Restriction to firms belonging to both main connected components
  d <- d[((!is.na(ffe1))&(!is.na(ffe0))),]
  
  n_units <- nunits(d)
  r$nindiv_comp <- c(n_units[2],n_units[1])
  r$nfirm_comp <- c(n_units[3],n_units[1])
  r$nmover_comp <- c(n_units[4],n_units[1])
  if(method=="yearly mean"){
    r$mean_lwage_comp <- d[, mean(lnhwage, na.rm=TRUE)] 
  }else{
    r$mean_lwage_comp <- d[, mean(wagevar, na.rm=TRUE)]
  }
  
  # For the split-firm method, we compute corrected estimates of var(wfe) and var(u)
  # with IR, the split "individual residual" wage1 - ffe0 - Xb0
  # For a given individual, we have var(IR0) = var(u) + var(err0) + covariances
  # with err0 the estimation error in ffe0+Xb0 
  # and the covariances null by hypothesis of the independence of the splits
  # We then have var(IR0) a slight overestimation of var(u)
  # but cov(wagevar, IR0) an estimation of var(u)
  # 
  if((split_method!="none")&(approx_u)){
    d[,c("covyIR0","covyIR1") := list(cov(IR0,wagevar, use = "pairwise.complete.obs"),
                                      cov(IR1,wagevar, use = "pairwise.complete.obs")),
      by = indiv_id] #covariance estim?e avec correction de sample size
    
    varu <- (mean(d$covyIR0, na.rm=TRUE)+mean(d$covyIR1, na.rm=TRUE))/2
    n1 <- nrow(na.omit(d[,.SD,.SDcols = "covyIR1"]))
    n0 <- nrow(na.omit(d[,.SD,.SDcols = "covyIR0"]))
    n_varu <- n1+n0
  }
  

  # Card, Heining, Kline 2013 decomposition
  # based on wagevar = wfe + ffe + Xb (+ yfe) + u
  r$var_ycomp <- getvar(d, "wagevar")
  r$var_FFE <- getvar(d, "ffe1", "ffe0")
  r$var_Xb <- getvar(d, "Xb1","Xb0")
  r$Twocov_WFE_FFE <- crosscov(d, "wfe0", "ffe1", "wfe1", "ffe0")
  r$Twocov_WFE_Xb <- crosscov(d, "wfe0", "Xb1", "wfe1", "Xb0")
  r$Twocov_FFE_Xb <- crosscov(d, "ffe0", "Xb1", "ffe1", "Xb0")
  r$Twocov_FFE_u <- crosscov(d, "ffe0", "u1", "ffe1", "u0")
  r$Twocov_Xb_u <- crosscov(d, "Xb0", "u1", "Xb1", "u0")
  
  if(split_method %in% c("periods","consec_periods")){
    r$var_WFE <- getvar(d, "wfe1","wfe0")
    r$var_u <- crossvar(d, "wagevar","r0", "wagevar", "r1")
    r$Twocov_WFE_u <- crosscov(d, "wfe0","u1", "wfe1", "u0")
  }else if(split_method %in% c("none")){
    r$var_WFE <- getvar(d, "wfe_all")
    r$var_u <- getvar(d, "u")
    r$Twocov_WFE_u <- crosscov(d, "wfe0","u1", "wfe1", "u0")
  }else{
    if(approx_u){
      r$var_WFE <- c(NA,NA)
      r$var_u <- c(varu, n_varu)
      r$Twocov_WFE_u <- c(NA,NA)
    }
    else{
      r$var_WFE <- c(NA,NA)
      r$var_u <- c(NA,NA)
      r$Twocov_WFE_u <- c(NA,NA)
    }
  }

  if((method=="yearly mean")|(r$startyear[1]==r$endyear[1])){
    d$yfe1 <- 0
    d$yfe0 <- 0
    r$var_YFE <- c(0,0)
    r$Twocov_WFE_YFE <- c(0,0)
    r$Twocov_FFE_YFE <- c(0,0)
    r$Twocov_Xb_YFE <- c(0,0)
    r$Twocov_u_YFE <- c(0,0)
  }else{
    r$var_YFE <- getvar(d, "yfe1", "yfe0")
    r$Twocov_WFE_YFE <- crosscov(d, "wfe0", "yfe1", "wfe1", "yfe0")
    r$Twocov_FFE_YFE <- crosscov(d, "ffe0", "yfe1", "ffe1", "yfe0")
    r$Twocov_Xb_YFE <- crosscov(d, "Xb0", "yfe1", "Xb1", "yfe0")
    r$Twocov_u_YFE <- crosscov(d, "r0", "yfe1", "r1", "yfe0")
  }
  
  # Song et al. 2019 decomposition
  # based on var(wagevar) = E(var(wagevar|firm)) + var(E(wagevar|firm))
  # crossd with Card et al. decomposition
  # We first need to compute wages and estimates' averages and residuals per firm
  d[, c("mwage","mwfe1","mwfe0","mXb1","mXb0","mu1","mu0","myfe1","myfe0")
    := list(mean(wagevar, na.rm=TRUE),mean(wfe1, na.rm=TRUE),mean(wfe0, na.rm=TRUE),
            mean(Xb1, na.rm=TRUE),mean(Xb0, na.rm=TRUE),mean(u1, na.rm=TRUE),
            mean(u0, na.rm=TRUE),mean(yfe1, na.rm=TRUE),mean(yfe0, na.rm=TRUE)),
    by=firm_id]

  # Between firms
  r$var_my <- getvar(d, "mwage", "mwage")
  r$var_mWFE <- getvar(d, "mwfe1", "mwfe0")
  r$var_mXb <- getvar(d, "mXb1", "mXb0")
  r$Twocov_mWFE_FFE <- crosscov(d, "mwfe0", "ffe1", "mwfe1", "ffe0")
  r$Twocov_mWFE_mXb <- crosscov(d, "mwfe0", "mXb1", "mwfe1", "mXb0")
  r$Twocov_FFE_mXb <- crosscov(d, "ffe0", "mXb1", "ffe1", "mXb0")
  
  if((method=="yearly mean")|(r$startyear[1]==r$endyear[1])){
    r$var_mYFE <- c(0,0)  
    r$Twocov_mWFE_mYFE <- c(0,0) 
    r$Twocov_mYFE_mXb <- c(0,0) 
    r$Twocov_FFE_mYFE <- c(0,0)
  }else{
    r$var_mYFE <- getvar(d, "myfe1", "myfe0") 
    r$Twocov_mWFE_mYFE <- crosscov(d, "mwfe0", "myfe1", "mwfe1", "myfe0")
    r$Twocov_mYFE_mXb <- crosscov(d, "mXb0", "myfe1", "mXb1", "myfe0")
    r$Twocov_FFE_mYFE <- crosscov(d, "ffe0", "myfe1", "ffe1", "myfe0")
  }

  # We then need to compute difference from mean per firm for wages and estimates
  d$diff_wage <- d$wagevar - d$mwage
  d$diff_wfe1 <- d$wfe1 - d$mwfe1
  d$diff_wfe0 <- d$wfe0 - d$mwfe0
  d$diff_IR1 <- d$mIR1 - d$mwfe1
  d$diff_IR0 <- d$mIR0 - d$mwfe0
  d$diff_Xb1 <- d$Xb1 - d$mXb1
  d$diff_Xb0 <- d$Xb0 - d$mXb0
  d$diff_u1 <- d$u1 - d$mu1
  d$diff_u0 <- d$u0 - d$mu0
  d$diff_yfe1 <- d$yfe1 - d$myfe1
  d$diff_yfe0 <- d$yfe0 - d$myfe0
  
  # Within firms
  #Song_within <- d[, list(cov(.SD, use="pairwise.complete.obs")),
  #                 .SDcols = c("diff_wage","diff_wfe1","diff_Xb1","diff_u1","diff_wfe0","diff_Xb0","diff_u0","diff_yfe1","diff_yfe0")]
  
  r$var_diffy <- getvar(d, "diff_wage")
  d[,c("sample_diff") := list(var(wagevar, na.rm=TRUE)),
    by = firm_id] #intra-firm variance estimated with correction for sample size = nobs per firm
  r$sample_diff <- c(mean(d$sample_diff, na.rm=TRUE), length(d$sample_diff))
  r$var_diffXb <- getvar(d, "diff_Xb1","diff_Xb0")
  r$Twocov_diffWFE_Xb <- crosscov(d, "diff_wfe0", "diff_Xb1", "diff_wfe1", "diff_Xb0")
  r$Twocov_diffXb_u <- crosscov(d, "diff_u0", "diff_Xb1", "diff_u1", "diff_Xb0")
  
  if(split_method %in% c("periods","consec_periods","none")){
    r$var_diffWFE <- getvar(d, "diff_wfe1", "diff_wfe0")
    r$var_diffu <- getvar(d, "diff_u1", "diff_u0")
    r$Twocov_diffWFE_u <- crosscov(d, "diff_wfe0", "diff_u1", "diff_wfe1", "diff_u0")
  }else{
    if(approx_u){
      r$var_diffWFE <- crossvar(d, "diff_IR1","diff_wfe0","diff_IR0","diff_wfe1")
      r$var_diffu <- c(NA, NA)
      r$Twocov_diffWFE_u <- c(NA,NA)
    }
    else{
    r$var_diffWFE <- c(NA,NA)
    r$var_diffu <- c(NA,NA)
    r$Twocov_diffWFE_u <- c(NA,NA)
  }}
  
  if((method=="yearly mean")|(r$startyear[1]==r$endyear[1])){
    r$var_diffYFE <- c(0,0)
    r$Twocov_diffWFE_YFE <- c(0,0)
    r$Twocov_diffXb_YFE <- c(0,0)
    r$Twocov_diffu_YFE <- c(0,0)
  }else{
    r$var_diffYFE <- getvar(d, "diff_yfe1", "diff_yfe0")
    r$Twocov_diffWFE_YFE <- crosscov(d, "diff_wfe0", "diff_yfe1", "diff_wfe1", "diff_yfe0")
    r$Twocov_diffXb_YFE <- crosscov(d, "diff_Xb0", "diff_yfe1", "diff_Xb1", "diff_yfe0")
    r$Twocov_diffu_YFE <- crosscov(d, "diff_u0", "diff_yfe1", "diff_u1", "diff_yfe0")
  }

  toc()
  
  as.data.frame(r)
}


# double-FE regressions followed by variance decomposition
##########################################################

saverepdt <- saverep

decompo <- function(d, y_name, method, split_metho="none"){
  if (split_metho=="none"){
    res <- split_fast_decompo(d, y_name=y_name, method=method)
  }
  else{
    res <- split_fast_decompo(d, y_name=y_name, method=method, split_method=split_metho)
  }
  res
}

estim <- function(d, need_estim=TRUE, startyear, split_metho="none", metho="yearly mean", sizelim=1, p=6,
                  yname="loghourly", rotat_year=TRUE, groupvar="", saveopt=TRUE, savn, results, 
                  return_data=FALSE, cluster=FALSE, statdes=TRUE){
  tic("estim")
  if (need_estim){
    d <- d[d$firmsize>=sizelim,]
    if (split_metho=="none"){
      res <- all_est(d, method=metho)
      age_coeff <- as.list(res$estimation$coefficients)
      ar2 <- as.list(r2(res$estimation, type = "ar2"))
      }
    else{
      res <- split_est(d, split_type = split_metho, method=metho)
      age_coeff <- as.list((res$estimation1$coefficients+res$estimation0$coefficients)/2)
      ar2 <- as.list((r2(res$estimation1, type = "ar2")+r2(res$estimation0, type = "ar2"))/2)
    }
  df <- res$data
  rm(res)
  }else{df<-d}
  
  if (cluster){
    df[,cluster:=firm_id]
    df[,firm_id:=firm]
  }
  
  decompol <- decompo(df, y_name=yname, method=metho, split_metho=split_metho)
  methoname <- paste(split_metho, sizelim)
  decompol$estimethod <- methoname
  decompol <- cbind(decompol, age_coeff)
  decompol <- cbind(decompol, ar2)
  
  if (rotat_year){
    for (i in c(1:p)){
      decompoly <- decompo(df[year==startyear-1+i,], y_name=yname, method=metho, split_metho=split_metho)
      methoname <- paste(split_metho, sizelim, "by_year", sep="")
      decompoly$estimethod <- methoname
      decompoly <- cbind(decompoly, age_coeff)
      decompoly <- cbind(decompoly, ar2)
      
      
      decompol <- rbind(decompol, decompoly, fill=TRUE)
    }
  }
  
  if (!(groupvar=="")){
    names(df)[names(df)==groupvar] <- "group"
    print(table(df$group))
    grouplist <- unique(df[,group])
    # within groups
    for (i in grouplist){
      decompolg <- decompo(df[group==i,], y_name=yname, method=metho, split_metho=split_metho)
      methoname <- paste(split_metho, sizelim, "by_", groupvar, "_", i, sep="")
      decompolg$estimethod <- methoname
      decompolg <- cbind(decompolg, age_coeff)
      decompolg <- cbind(decompolg, ar2)
      

      decompol <- rbind(decompol,decompolg, fill=TRUE)
    }
    # between groups
    df$firm_save <- df$firm_id
    df$firm_id <- df$group
    decompolg <- decompo(df, y_name=yname, method=metho, split_metho=split_metho )
    methoname <- paste(split_metho, sizelim, "_between_", groupvar, sep="")
    decompolg$estimethod <- methoname
    decompolg <- cbind(decompolg, age_coeff)
    decompolg <- cbind(decompolg, ar2)
    decompol <- rbind(decompol,decompolg, fill=TRUE)
    df$firm_id <- df$firm_save
  }
  
  savename <- paste(savn,methoname,sep="")
  if(saveopt==TRUE){
    write.csv(decompol, file=paste(savename,".csv",sep=""))
  }
  
  if (statdes==TRUE){
    tic("Stats after AKM")
    stat <- statdes(d)
    savenamesex <- paste(savename,"sex",".csv", sep="")
    write.csv(stat$sex, file=savenamesex)
    savenamedep <- paste(savename,"dep",".csv", sep="")
    write.csv(stat$dep, file=savenamedep)
    savenamereg <- paste(savename,"reg",".csv", sep="")
    write.csv(stat$reg, file=savenamereg)
    savenamesize <- paste(savename,"size",".csv", sep="")
    write.csv(stat$size, file=savenamesize)
    savenamebetween <- paste(savename,"between",".csv", sep="")
    write.csv(stat$between, file=savenamebetween)
    savenameobs <- paste(savename,"obs",".csv", sep="")
    write.csv(stat$obs, file=savenameobs)
    
    detail_stat <- detailed_statdes(d)
    savenameobs <- paste(savename,"detailed_stat_des",".csv", sep="")
    write.csv(detail_stat, file=savenameobs)
    toc()
  }
  
  if (return_data){
    if (cluster){
      df[, firm:=firm_id]
      df[, firm_id:=cluster]
    }
    return(list(decompol, df))
    toc()
  }else{
    rm(df)
    gc()
    toc()
    return(list(decompol))}
}

fast_panel <- function(p=6, startyear=2002, saverep = saverepdt,
                       estab=FALSE, cluster=FALSE, statdes=TRUE, saveopt=TRUE, 
                       metho="yearlymean", agemin=16, agemax=70, dureemin=360, 
                       sizemin=1, assoc=TRUE, savenametag=""){
  tic("building panel")
  panel <- list()
  for (y in 1:p){
    print(paste("reading file for year ", y-1+startyear))
    panel[[y]] <- read.fst(paste(fstrep, y-1+startyear, ".fst", sep=""))
  }
  print("loading done, binding years in panel")
  d <- rbindlist(panel)
  rm(panel)
  d <- d[DUREE>=dureemin,]
  if(!assoc){
    d <- d[CJ!=9,]
  }
  if(estab){
    names(d)[names(d)=="firm_id"] <- "firm"
    names(d)[names(d)=="est"] <- "firm_id"
  }
  # currently, cluster option only works for periods 2002-2007, 2008-2013 and 2014-2019
  # and firms with more than 20 obs
  if(cluster){
    period <- round((startyear-2002)/6)+1
    if(sizemin==20){
      size <- "_20"
    }else{
      size <- ""
    }
    clusters <- read.csv(paste(clusterrep,"estimatedclusters_period",period,size,".csv",sep=""),
                         sep=" ",header=F)
    names(clusters) <- c("firm_id","cluster")
    clusters$firm_id <- str_sub(clusters$firm_id, 2, -1)
    d <- merge(d,clusters,by="firm_id")
    d[, firm:=firm_id]
    d[, firm_id:=as.character(cluster)]
    print(summary(d$firm))
  }
  # age
  d <- d[AGE<=agemax,]
  d <- d[AGE>=agemin,]
  d[, c("age_2","age_3") := list((AGE-40)^2,(AGE-40)^3)]
  # min number of indiv per firm per panel
  d[, firmsize := uniqueN(indiv_id), by=firm_id]
  d <- d[d$firmsize>=sizemin,]
  if (cluster|estab){
    d <- d[,.(year, AGE, age2, age_2, age_3, indiv_id, firm_id, loghourly, lnhwage, SEXE, DEPT, REGT, yearlyfirmsize, firmsize, occ, ind, firm)]
  } else{
    d <- d[,.(year, AGE, age2, age_2, age_3, indiv_id, firm_id, loghourly, lnhwage, SEXE, DEPT, REGT, yearlyfirmsize, firmsize, occ, ind)]
  }
  toc()
    
  savn <- paste(saverep,savenametag,startyear,startyear+p-1,metho, sep="")
    
  if (statdes==TRUE){
    tic("Stats before AKM")
    stat <- statdes(d)
    savenamesex <- paste(savn,"sex",".csv", sep="")
    write.csv(stat$sex, file=savenamesex)
    savenamedep <- paste(savn,"dep",".csv", sep="")
    write.csv(stat$dep, file=savenamedep)
    savenamereg <- paste(savn,"reg",".csv", sep="")
    write.csv(stat$reg, file=savenamereg)
    savenamesize <- paste(savn,"size",".csv", sep="")
    write.csv(stat$size, file=savenamesize)
    savenamebetween <- paste(savn,"between",".csv", sep="")
    write.csv(stat$between, file=savenamebetween)
    savenameobs <- paste(savn,"obs",".csv", sep="")
    write.csv(stat$obs, file=savenameobs)
    
    detail_stat <- detailed_statdes(d)
    savenameobs <- paste(savn,"detailed_stat_des",".csv", sep="")
    write.csv(detail_stat, file=savenameobs)
    toc()
  }
    
#  d <- d[,.(year, AGE, age2, age_2, age_3, indiv_id, firm_id, loghourly, lnhwage, firmsize)]
    
  d
}

rotestim <- function(built_panel=NULL, period_length=6, start_year_list=c(2002, 2008, 2014), saverep = saverepdt,
                     method_list=c("AKM","fsplit","psplit"), estab=FALSE, cluster=FALSE,
                     rotat_year=TRUE, year_method="yearlymean", statdes=TRUE, saveopt=TRUE,
                     agemin=16, agemax=70, dureemin=360, sizemin=1, assoc=TRUE, return_panel=FALSE, 
                     save_estim=FALSE, savenametag=""){
  results <- list()
  
  if (year_method=="yearlymean"){
    metho <- "yearly mean"
    yname <- "loghourly"
  }
  if (year_method=="yearFE"){
    metho <- "Card"
    yname <- "lnhwage"
  }
  
  p <- period_length
  
  for (startyear in start_year_list){
    if (is.null(built_panel)){
      d <- fast_panel(p=period_length, startyear=startyear, saverep = saverep,
                      estab=estab, cluster=cluster, statdes=statdes, saveopt=saveopt, 
                      metho=metho, agemin=agemin, agemax=agemax, dureemin=dureemin, 
                      sizemin=sizemin, assoc=assoc, savenametag=savenametag)
    }else{
      d <- built_panel
    }
    
    d$firm_id <- as.character(d$firm_id)
    d$indiv_id <- as.character(d$indiv_id)
    
    savn <- paste(saverep,savenametag,startyear,startyear+period_length-1,metho, sep="")
   
    if ("AKM" %in% method_list){
      tic("AKM all")
      esti <- estim(d, startyear=startyear, split_metho="none", metho=metho, sizelim=sizemin, p=period_length, 
                    rotat_year=rotat_year, saveopt=saveopt, savn=savn, yname=yname, return_data=save_estim, cluster=cluster, statdes=statdes)
      results <- rbind(setDT(results), setDT(esti[[1]]), fill=TRUE)
      if(save_estim==TRUE){
        write_parquet(
        x = esti[[2]],
        sink = paste(savn, "_nosplit.parquet", sep=""))
      }
      toc()
    }
    
    if ("fsplit" %in% method_list){
      tic("Firm split AKM")
      esti <- estim(d, startyear=startyear, split_metho="by_firm_movers", metho=metho, sizelim=sizemin, p=period_length, 
                    rotat_year=rotat_year, saveopt=saveopt, savn=savn, yname=yname, return_data=save_estim, cluster=cluster, statdes=statdes)
      results <- rbind(setDT(results), setDT(esti[[1]]), fill=TRUE)
      if(save_estim==TRUE){
        write_parquet(
          x = esti[[2]],
          sink = paste(savn, "_fsplit.parquet", sep=""))
      }
      toc()
    }
    
    if ("psplit" %in% method_list){
      tic("Period split AKM")
      esti <- estim(d, startyear=startyear, split_metho="consec_periods", metho=metho, sizelim=sizemin, p=period_length, 
                    rotat_year=rotat_year, saveopt=saveopt, savn=savn, yname=yname, return_data=save_estim, cluster=cluster, statdes=statdes)
      results <- rbind(setDT(results), setDT(esti[[1]]), fill=TRUE)
      if(save_estim==TRUE){
        write_parquet(
          x = esti[[2]],
          sink = paste(savn, "_psplit.parquet", sep=""))
      }
      toc()
    }
  }
  gc()
  dates <- paste(start_year_list, collapse="-")
  savename <- paste(saverep,savenametag,metho,dates,"_Allres_",p,"years_panel",".csv", sep="")
  if(saveopt==TRUE){
    write.csv(results, file=savename)
    }
  if (return_panel){
    return(list(d, results))  
  }else{
    rm(d)
    gc()
    return(list(results))
  }
  
}


multisplit <- function(iter=20, year=2002, ...){
  results <- list()
  for (i in c(1:iter)){
    saverepiter <- paste(saverepdt,year,"multi",i,"/", sep="")
    iterdir <- file.path(saverepiter) 
    if (!dir.exists(iterdir)) dir.create(iterdir)
    if(i==1){
      res <- rotestim(built_panel=NULL, method_list = c("fsplit"), rotat_year = FALSE, start_year_list=c(year),
                      statdes=FALSE, saveopt=TRUE, saverep = saverepiter, return_panel=TRUE, ...)
      d <- res[[1]]
      res <- res[[2]]
    }else{
      res <- rotestim(built_panel=d, method_list = c("fsplit"), rotat_year = FALSE, start_year_list=c(year),
                      statdes=FALSE, saveopt=TRUE, saverep = saverepiter, return_panel=FALSE, ...)[[1]]
    }
    results <- rbind(results, res)
  }
  savename <- paste(saverep, "multi",i,".csv", sep="")
  write.csv(results, file = savename)
  results
}
  


############################
# Fixed Effects Regression #
############################

# FE (fixed effects) are computed with an AKM model
# We start from the complete result file from this model for each period.

# In is not necessary to split-correct before FE regression, since we can get 
# correct regression parameter estimates from noisy FE estimates, as in any
# regression

# However, if we use these regression results for wage variance decomposition
# we come back to the limited mobility bias, and the split correction here might
# be complex

cleanestim <- function(d, split_correction=FALSE){
  #This function takes the data file exported from an AKM model estimation 
  #and prepares it for fixed-effects regressions
  
  nomrep <- nomenclatures
  naf <- read.csv2(paste(nomrep, "naf2008_liste_n1.csv", sep=""))
  nafind <- read.csv2(paste(nomrep, "naf2008_5_niveaux.csv", sep=""))
  pcs <- read.csv2(paste(nomrep, "Nomenclature_N2_PCS2020.csv", sep=""))
  names(pcs) <- c("code","label","label_short")
  names(naf) <- c("code","label")
  nafind <- nafind[,c("NIV1","NIV3")]
  nafind$NIV3 <- substr(nafind$NIV3,1,2)
  nafind <- unique(nafind)
  
  d[!(occ %in% pcs$code), occ:=NA]
  d[!(ind %in% nafind$NIV3), ind:=NA]
  d$ind <- as.character(d$ind)
  d$occ <- as.character(d$occ)

  if (split_correction){
    d[,wfe_all:=rowMeans(.SD, na.rm=T),.SDcols = c("wfe0","wfe1")]
    d[,ffe_all:=rowMeans(.SD, na.rm=T),.SDcols = c("ffe0","ffe1")]
  }
  
  clean <- d[(!is.na(wfe_all))&(!is.na(ffe_all)),]
  clean <- merge(clean, nafind, by.x="ind", by.y="NIV3", all.x=TRUE)
  names(clean)[names(clean)=="NIV1"] <- "naf1"
  
  # On centre les effets fixes
  mwfe <- mean(clean$wfe_all, na.rm=TRUE)
  mffe <- mean(clean$ffe_all, na.rm=TRUE)
  
  clean$wfe <- clean$wfe_all - mwfe
  clean$ffe <- clean$ffe_all - mffe
  
  return(clean)
}

fereg_period <- function(d, 
                         save_estimated_coefficients=F, 
                         savename="fereg", 
                         yearref="02"){
  # This function estimates fixed effects regressions and saves the results, for a given period
  
  d[, c("naf1", "occ"):= list(relevel(as.factor(naf1), ref = "G"),relevel(as.factor(occ), ref = "55"))]
  d[, "naf1":=recode(naf1, "A"="AC", "B"="AC", "C"="AC",
                     "D"="DE", "E"="DE", 
                     "L"="LM", "M"="LM",
                     "O"="OPQ", "P"="OPQ", "Q"="OPQ", 
                     "S"="STU", "T"="STU", "U"="STU")]
  d[, "occ":=recode(occ, "10"="20", "21"="20", "22"="20", "23"="20", 
                    "31"="32","33"="32", "34"="32", "44"="32")]
  occ <- table(d$occ)
  naf <- table(d$naf1)
  sexe <- table(d$SEXE)
  
  wfereg <- feols(wfe~SEXE+naf1+occ, cluster = ~ firm_id, 
                  data=d)
  ffereg <- feols(ffe~SEXE+naf1+occ, cluster = ~ firm_id, 
                  data=d)
  
  print(summary(wfereg))
  print(summary(ffereg))
  
  mw1 <- rbind(wfereg$coefficients, wfereg$se)
  mf1 <- rbind(ffereg$coefficients, ffereg$se)
  
  if (save_estimated_coefficients)
    write_parquet(
      x = d,
      sink = paste(saverep, savename, ".parquet", sep="")
    )
  
  save(mw1, file = paste(saverep, "reg_rez_wfe_",yearref,".RData", sep=""))
  save(mf1, file = paste(saverep, "reg_rez_ffe_",yearref,".Rdata", sep=""))
  save(occ, file = paste(saverep, "occupation_table",yearref,".RData", sep=""))
  save(naf, file = paste(saverep, "industry_table",yearref,".RData", sep=""))
  save(sexe, file = paste(saverep, "sexe_table",yearref,".RData", sep=""))
}

occ_FE <- function(d, saveref){
  #  ''
  #  regress individual FE on occupation and saves the FE and residuals in the datafile
  #  ''
  d <- data.table(d)
  # Regression for split 0
  lm_w0 <- lm(wfe0 ~ occ, data = d[!(is.na(wfe0))&!(is.na(occ)),])
  
  summary_lm_w0 <- summary(lm_w0)
  print(summary_lm_w0)
  
  d <- d[!(is.na(wfe0))&!(is.na(occ)), epsilon_w0:=residuals(lm_w0)]
  
  r20 <- summary_lm_w0$r.squared
  
  tidy_lm_w0 <- tidy(lm_w0)
  omega0 <- tidy_lm_w0[,c(1,2)]
  names(omega0) <- c("occ","omega0")
  d <- merge(d, omega0, by="occ", all.x=T)
  
  rm(lm_w0)
  rm(summary_lm_w0)
  gc()
  

  # Regression for split 1
  lm_w1 <- lm(wfe1 ~ occ, data = d[!(is.na(wfe1))&!(is.na(occ)),])
  
  summary_lm_w1 <- summary(lm_w1)
  print(summary_lm_w1)
  
  d <- d[!(is.na(wfe1))&!(is.na(occ)), epsilon_w1:=residuals(lm_w1)]
  
  r21 <- summary_lm_w1$r.squared
  
  tidy_lm_w1 <- tidy(lm_w1)
  omega1 <- tidy_lm_w1[,c(1,2)]
  names(omega1) <- c("occ","omega1")
  d <- merge(d, omega1, by="occ", all.x=T)
  rm(lm_w1)
  rm(summary_lm_w1)
  gc()
  
  save(tidy_lm_w0, file = paste(saverep, "reg_occ_wfe_0",saveref,".RData", sep=""))
  save(tidy_lm_w1, file = paste(saverep, "reg_occ_wfe_1",saveref,".RData", sep=""))
  
  return(d)
}

keepoccstat <- function(d){
  # this function produces summary statistics by occupation and split
  # that can be used to describe the role of occupations in var(wfe)
  d <- d[, c("ofe0","ofe1") := list(mean(wfe0, na.rm=TRUE),mean(wfe1, na.rm=TRUE)), by = "occ"]
  occstat <- d[,list(mean(wfe0, na.rm=TRUE),
                     mean(wfe1, na.rm=TRUE),
                     .N,
                     var(wfe0, na.rm=TRUE),
                     var(wfe1, na.rm=TRUE)),
               by = "occ"]
  occ_comp0 <- d[(comp0==1),.N, by = .(occ, comp0)]
  occ_comp1 <- d[(comp1==1),.N, by = .(occ, comp1)]
  occstat <- merge(occstat, occ_comp0[,.(occ,N)], by="occ")
  occstat <- merge(occstat, occ_comp1[,.(occ,N)], by="occ")
  names(occstat) <- c("occ","occ_fe_0","occ_fe_1","n_obs","var_wfe0","var_wfe1","n_obs_comp0","n_obs_comp1")
  
  allstat <- d[,list(mean(wfe0, na.rm=TRUE),
                     mean(wfe1, na.rm=TRUE),
                     .N,
                     var(wfe0, na.rm=TRUE),
                     var(wfe1, na.rm=TRUE),
                     sum(comp0==1, na.rm=TRUE),
                     sum(comp0==1, na.rm=TRUE))]
  names(allstat) <- c("occ_fe_0","occ_fe_1","n_obs","var_wfe0","var_wfe1","n_obs_comp0","n_obs_comp1")
  allstat$occ <- "All occ"
  
  occstat <- rbind(occstat, allstat)
  
  return(occstat)
}

occ_decomp <- function(d){
  # this function recycles the main decomposition function to compute occupational sorting and segregation
  # a quick and dirty way is to replace Xb (age effects) with occupational effects
  
  de <- d[, c("Xb0","Xb1") := list(mean(wfe0, na.rm=TRUE),mean(wfe1, na.rm=TRUE)), by = "occ"]
  de <- de[, c("wfe0","wfe1") := list(wfe0-Xb0,wfe1-Xb1)]
  dec <- split_fast_decompo(de, method = "Card", split_method = "firm", y_name="lnhwage", approx_u=TRUE)
  
  rm(de)
  gc()
  
  names(dec) <- c("startyear", "endyear", "estimethod", "nindiv", "nfirm", "nmover", 
                  "mean_lwage", "vary", "between", "within", "sample_within", "nindiv_comp", 
                  "nfirm_comp", "nmover_comp", "mean_lwage_comp", "var_ycomp", "var_FFE", "var_OccFE", 
                  "Twocov_WFEres_FFE", "Twocov_WFEres_OccFE", "Twocov_FFE_OccFE", "Twocov_FFE_u", "Twocov_OccFE_u", "var_WFEres", 
                  "var_u", "Twocov_WFEres_u", "var_YFE", "Twocov_WFEres_YFE", "Twocov_FFE_YFE", "Twocov_OccFE_YFE", 
                  "Twocov_u_YFE", "var_my", "var_mWFEres", "var_mOccFE", "Twocov_mWFEres_FFE", "Twocov_mWFEres_mOccFE", 
                  "Twocov_FFE_mOccFE", "var_mYFE", "Twocov_mWFEres_mYFE", "Twocov_mYFE_mOccFE", "Twocov_FFE_mYFE", "var_diffy", 
                  "sample_diff", "var_diffOccFE", "Twocov_diffWFEres_OccFE", "Twocov_diffOccFE_u", "var_diffWFEres", "var_diffu", 
                  "Twocov_diffWFEres_u", "var_diffYFE", "Twocov_diffWFEres_YFE", "Twocov_diffOccFE_YFE", "Twocov_diffu_YFE")  
  
  return(dec)
}



###################
# Firm demography #
###################

# Build firm-level datasets from the job-level results:

firmdt <- function(d){
  tic("firm-level averages")
  # centering fixed effects
  mwfe <- mean(d$wfe_all, na.rm=TRUE)
  mffe <- mean(d$ffe_all, na.rm=TRUE)
  
  d$wfe <- d$wfe_all - mwfe
  d$ffe <- d$ffe_all - mffe
  
  df <- d[, list(.N, mean(SEXE=="2", na.rm=TRUE), mean(AGE, na.rm=TRUE), mean(firmsize, na.rm=TRUE), 
                 min(year, na.rm=TRUE), max(year, na.rm=TRUE), mean(ffe, na.rm=TRUE), 
                 mean(wfe, na.rm=TRUE)),
          by=firm_id]
  names(df) <- c("firm_id","nobs","prop_women","mean_age","mean_size","first_year","last_year","ffe","mwfe")
  #d[,"occ":=as.factor(occ)]
  #df <- merge(df, d[,as.list(table(occ)/.N),by="firm_id"], by="firm_id")
  
  df <- merge(df, d[,names(which.max(table(ind))),by=firm_id], by="firm_id")
  names(df)[names(df)=="V1"] <- "ind"
  df <- merge(df, d[,table(ind)[which.max(table(ind))]/.N,by=firm_id], by="firm_id")
  names(df)[names(df)=="V1"] <- "ind_coherence"
  toc()

  return(df)
}

############################
# wage quantiles evolution #
############################

yearlyselect <- function(year, minsize=1, minage=16, maxage=70, minduree=360){
  d <- as.data.table(read.fst(paste(fstrep, year, ".fst", sep="")))
  d <- d[d$yearlyfirmsize>=minsize,]
  d <- d[AGE<=maxage,]
  d <- d[AGE>=minage,]
  d <- d[DUREE>=minduree,]
  return(d)
}

period_quantile <- function(startyear, endyear, minsize=1, minage=16, maxage=70, minduree=360){
  quantl <- list()
  for (ye in startyear:endyear){
    print(paste("quantiles for year", ye))
    d <- yearlyselect(year=ye, minsize=minsize, minage=minage, maxage=maxage, minduree=minduree)
    q <- data.frame(quantile(d$lnhwage, probs = c(0.01, 0.1, 0.5, 0.9, 0.99)))
    quantl <- append(quantl, q)
  }
  quantl <- data.frame(quantl)
  names(quantl) <- as.character(startyear:endyear)
  return(quantl)
} 

##################################
# Historical series narrow panel #
##################################

hist_dataprep <- function(d){
  tic("historical series dataprep")
  # Aligning variable names on exhaustive files
  names(d)[names(d)=="hours"] <- "NBHEUR"
  names(d)[names(d)=="days"] <- "DUREE"
  names(d)[names(d)=="sex"] <- "SEXE"
  names(d)[names(d)=="age_2"] <- "age2"

  d <- d[DUREE>=90,]
  d <- d[earnings_gross_r>0,]
  d <- d[(d$earnings_net_r<d$earnings_gross_r)&(d$earnings_gross_r<10*d$earnings_net_r),]
  
  d <- na.omit(d, cols=c("AGE", "age2", "indiv_id", "firm_id", "year", "earnings_gross_r"))
  
  d[,lnhwage := log(earnings_gross_r/NBHEUR)]
  d[,lndwage := log(earnings_gross_r/DUREE)]
  d[,lnhwagen := log(earnings_net_r/NBHEUR)]
  d[,lndwagen := log(earnings_net_r/DUREE)]
  
  d[, hyearmean := mean(lnhwage, na.rm=TRUE), by = year]
  d[fulltime==1, dyearmean := mean(lndwage, na.rm=TRUE), by = year] # Daily wage series computed only on fulltime workers
  d[, hnyearmean := mean(lnhwagen, na.rm=TRUE), by = year]
  d[fulltime==1, dnyearmean := mean(lndwagen, na.rm=TRUE), by = year] # Daily wage series computed only on fulltime workers
  
  d[, loghourly := lnhwage - hyearmean]
  d[, logdaily := lndwage - dyearmean]
  d[, loghourlyn := lnhwagen - hnyearmean]
  d[, logdailyn := lndwagen - dnyearmean]
  
  d[, yearlyfirmsize := uniqueN(indiv_id), by=firm_id]
  
  toc()
  
  d
}

varselect <- function(d,typsal="hourly"){
  if (typsal=="hourlynet"){
    d <- d[NBHEUR>100,]
    d <- d[d$lnhwagen>(log(d$threshold)+hnyearmean-hyearmean),]
    d <- d[d$lnhwagen<(log((d$smic*1000))+hnyearmean-hyearmean),]
    d$varsalc <- d$loghourlyn
    d$varsal <- d$lnhwagen
  }
  else if (typsal=="hourly"){
    d <- d[NBHEUR>100,]
    d <- d[d$lnhwage>log(d$threshold),]
    d <- d[d$lnhwage<log((d$smic*1000)),]
    d$varsalc <- d$loghourly
    d$varsal <- d$lnhwage
  }
  else if (typsal=="dailynet"){
    d <- d[d$fulltime==1,]
    d <- d[d$lndwagen>(log(d$smic*4)+dnyearmean-dyearmean),]
    d <- d[d$lndwagen<(log((d$smic*8000))+dnyearmean-dyearmean),]
    d$varsalc <- d$logdailyn
    d$varsal <- d$lndwagen
  }
  else if (typsal=="daily"){
    d <- d[d$fulltime==1,]
    d <- d[d$lndwage>log(d$smic*4),]
    d <- d[d$lndwage<log((d$smic*8000)),]
    d$varsalc <- d$logdaily
    d$varsal <- d$lndwage
  }
  d <- d[!is.na(d$varsalc),]
  d$loghourly <- d$varsalc
  d$lnhwage <- d$varsal
  d
}

hist_build_panel <- function(d=panel, p=6, startyear=1996, agemin=16, agemax=70, dureemin=360, 
                       sizemin=1, wage="hourly"){
  tic("building panel")
  d <- d[(year>=startyear)&(year<startyear+p)]
  d <- d[DUREE>=dureemin,]
  # age
  d <- d[AGE<=agemax,]
  d <- d[AGE>=agemin,]
  d[, c("age_2","age_3") := list((AGE-40)^2,(AGE-40)^3)]
  
  d <- varselect(d, typsal=wage)
  
  # min number of indiv per firm per panel
  d[, firmsize := uniqueN(indiv_id), by=firm_id]
  d <- d[d$firmsize>=sizemin,]
  toc()
  
  d <- d[,.(year, AGE, age2, age_2, age_3, indiv_id, firm_id, loghourly, lnhwage, firmsize)]
  
  d
}

hist_rotmulti <- function(sourcefile=panel, iter=20, yearlist=c(1996:2014), wage="hourly", agemin=16, 
                            agemax=70, dureemin=90, metho= "fsplit", year_method="yearFE",
                            sizemin=1, p=6, ...){
  results <- list()
  saverepiter <- paste(saverepdt,"multi_historic_series_",wage,"/", sep="")
  iterdir <- file.path(saverepiter) 
  if (!dir.exists(iterdir)) dir.create(iterdir)
  for (y in yearlist){
    print(y)
    d <- hist_build_panel(d=sourcefile, startyear=y, p=p, wage=wage, agemin=agemin, 
                        agemax=agemax, dureemin=dureemin, sizemin=sizemin)
    for (i in c(1:iter)){
      print(i)
      savnt <- paste("multi",i,wage,sep="_")
      res <- rotestim(built_panel=d, method_list = c(metho), year_method=year_method, rotat_year = FALSE, 
                      start_year_list=c(y), statdes=FALSE, saveopt=FALSE, saverep = saverepiter, 
                      agemin=agemin, savenametag=savnt, agemax=agemax, dureemin=dureemin, sizemin=sizemin, return_panel=FALSE, ...)[[1]]
      results <- rbind(results, res)
    }
    savename <- paste(saverepiter,wage,"_",metho,"multi_",iter,"_",y,".csv", sep="")
    write.csv(results, file = savename)
  }
  savename <- paste(saverepiter,wage,"_",metho,"_multi_rot_",iter,yearlist[[1]],"_",y,".csv", sep="")
  write.csv(results, file = savename)
  results
}

###################
# Data simulation #
###################
# Simulated data is used both to test the code, and to test the split sampling correction results on realistic simulations

# Simulation from scratch (allowing to test the code without access to original, gated data)
mobsim <- function(nworkers=10000, 
                   nfirms=100, 
                   mobility=0.2,    # share of mobile workers in panel
                   nyears=6,        # length of panel in years
                   startyear=2002,
                   r=0.5,           # parameter for correlation between worker and firm fixed effects 
                   var_wfe=0.15,    # Parameter for variance of WFE 
                   var_ffe=0.01,    # Parameter for variance of FFE 
                   var_u=0.01,      # Parameter for variance of u 
                   var_yfe=0,       # Parameter for variance of year fixed effects
                   par_cor_u=0,     # Parameter for correlation u_t with u_t-1 for non-movers
                   par_age=0.004,    # Parameter for variable AGE in equation
                   par_age2=-0.0004,  # Parameter for variable (AGE-40)^2 in equation
                   par_age3=0.000008,  # Parameter for variable (AGE-40)^2 in equation
                   ytrend = 0    # Parameter for trend in year effects
                   ){
  p <- 1-((1-mobility)^(1/nyears)) # yearly probability of move to get the required level of overall mobility
  
  yfe <- rnorm(nyears)
  
  ident <- c(1:nworkers)
  AGE <- as.integer(runif(nworkers)*56+13)
  SEXE <- sample(1:2, size=nworkers, replace=TRUE)
  pcs <- read.csv2(paste(nomenclatures, "Nomenclature_N2_PCS2020.csv", sep=""))
  names(pcs) <- c("code","label","label_short")
  occ <- as.character(sample(pcs$code, size=nworkers, replace=TRUE))
  wfe <- rnorm(nworkers)
  sort <- rnorm(nworkers)+r*wfe
  wfe <- wfe[order(sort)]          # indiv and firm order will be used to simulate sorting
  di <- data.table(ident, AGE, SEXE, occ, wfe)

  firm <- c(1:nfirms)
  ffe <- rnorm(nfirms)
  ffe <- ffe[order(ffe)]
  DEPT <- sample(1:100, size=nfirms, replace=TRUE)
  REGT <- as.integer(DEPT/10)
  nafind <- read.csv2(paste(nomenclatures, "naf2008_5_niveaux.csv", sep=""))
  ind <- sample(substr(nafind$NIV3,1,2), size=nfirms, replace=TRUE)
  CATJUR <- sample(1:9, size=nfirms, replace=TRUE)
  est <- firm     # For simplicity we don't simulate distinct establishments
  nb_workers <- as.integer(exp(runif(nfirms)*10))
  df <- data.table(firm, ffe, DEPT, REGT, ind, CATJUR, est, nb_workers)
  
  
  # Year 1 :
  yfirm <- as.integer(as.factor(sample(firm,nworkers,replace=TRUE,prob=plnorm(nb_workers))))
  yfirm <- yfirm[order(yfirm)]
  d <- di
  d$firm <- yfirm
  d$year=startyear
  d$u <- rnorm(nworkers)
  d$yfe <- yfe[1]
  dy <- d
  
  # Year n :
  for (year in 1:(nyears-1)){
    dy$year <- startyear+year
    dy[, AGE := AGE+1]
    dy$yfe <- yfe[year+1]+year*ytrend
    movers <- (runif(nworkers)<p)*1
    nmovers <- sum(movers)
    yfirm <- as.integer(as.factor(sample(firm,nmovers,replace=TRUE,prob=plnorm(nb_workers))))
    sort <- rnorm(nmovers)+r*(yfirm/nmovers)
    yfirm <- yfirm[order(sort)]
    dy[movers==1, firm := yfirm]
    u2 <- rnorm(nmovers)
    dy[movers==1, u := u2]
    u3 <- rnorm(nworkers-nmovers)
    dy[movers==0, u := par_cor_u*u+(1-par_cor_u)*u3] # correlates residuals for non-movers
    dy[movers==0, u := u/sd(u)]
    d <- rbind(d, dy)
  }
  
  d <- merge(d,df,by="firm",all.x=T)
  d$wfe<-(var_wfe**0.5)*d$wfe/sd(d$wfe)
  d$ffe<-(var_ffe**0.5)*d$ffe/sd(d$ffe)
  d$yfe<-(var_yfe**0.5)*d$yfe/sd(d$yfe)
  d$u<-(var_u**0.5)*d$u
  
  n <- nrow(d)
  
  d$DUREE <- sample(c(360, 90), size=n, replace=TRUE)
  d$NBHEUR <- sample(c(35, 20), size=n, replace=TRUE)
  d$DOMEMPL <- 9
  d[, NBHEUR := round(NBHEUR*DUREE/7,0)]
  d[, lnhwage := 2.6+par_age*AGE+par_age2*((AGE-40)^2)+par_age3*((AGE-40)^3)+wfe+ffe+yfe+u]
  d[, lnwage := lnhwage+log(NBHEUR)]
  
  d$ident <- as.factor(d$ident)
  d$firm <- as.factor(d$firm)
  
  list(d, di, df)
}

# Simulation from an existing panel (simulating wages while keeping the mobility structure)
# It is difficult to simulate sorting through simulated wages uniquely, given existing mobility structure.
# To avoid this difficulty, we use estimated firm fixed effects as a starting point for simulated data
# so that firms connected by mobilities effectively have closer FFE than random firms
wagesim <- function(d,
                    newf=1,          # Parameter for adding randomness to estimated FFE
                    r=0.5,           # parameter for correlation between worker and firm fixed effects
                    b=0.8,           # parameter for segregation
                    var_wfe=0.15,    # Parameter for variance of WFE 
                    var_ffe=0.014,   # Parameter for variance of FFE 
                    var_u=0.01,      # Parameter for variance of u 
                    var_yfe=0,       # Parameter for variance of year fixed effects
                    par_age=0.012,   # Parameter for variable AGE in equation
                    par_age2=-0.0005,   # Parameter for variable (AGE-40)^2 in equation
                    par_age3=0.000007,  # Parameter for variable (AGE-40)^2 in equation
                    par_age_wfe=-0.01,  # Parameter for negative correlation between age and WFE
                    ytrend = 0       # Parameter for trend in year effects
                    ){
  yearmin <- min(d$year)
  yearmax <- max(d$year)
  nyears <- yearmax-yearmin+1
  yfe <- rnorm(nyears)
  for (y in 1:nyears){
    yfe[y] <- yfe[y] + y*ytrend
  }
  d[, sim_yfe := yfe[year-yearmin+1]]
  
  firms <- d[,list(.N, mean(ffe, na.rm=TRUE)),by=firm_id]
  names(firms) <- c("firm_id","nobs","ffe")
  firms[is.na(ffe),ffe:=0] # Firms outside the main connected components are given an average fixed effect.
  firms$sim_ffe <- firms$ffe + newf*rnorm(nrow(firms))
  firms$sim_mean_w <- r*firms$sim_ffe + rnorm(nrow(firms))
  d <- merge(d, firms[,c("firm_id","sim_ffe","sim_mean_w")], by="firm_id")
  
  indivs <- d[,list(.N, mean(sim_mean_w, na.rm=TRUE), mean(AGE-year+2002, na.rm=TRUE)),by=indiv_id]
  names(indivs) <- c("indiv_id","nobs","mean_f","mean_gen")
  indivs$sim_wfe <- b*indivs$mean_f + rnorm(nrow(indivs)) + par_age_wfe*indivs$mean_gen
  d <- merge(d, indivs[,c("indiv_id","sim_wfe")], by="indiv_id")

  d$sim_u <- rnorm(nrow(d))
  
  m <- d[,list(mean(sim_wfe, na.rm=TRUE),
               mean(sim_ffe, na.rm=TRUE),
               mean(sim_yfe, na.rm=TRUE),
               sd(sim_wfe, na.rm = TRUE),
               sd(sim_ffe, na.rm = TRUE),
               sd(sim_yfe, na.rm = TRUE))]
  
  d[, c("wfe","ffe","yfe","u") := list(
    (var_wfe**0.5)*(sim_wfe-m[[1]])/m[[4]],
    (var_ffe**0.5)*(sim_ffe-m[[2]])/m[[5]],
    (var_yfe**0.5)*(sim_yfe-m[[3]])/m[[6]],
    (var_u**0.5)*sim_u)]
  
  d[, Xb := par_age*AGE+par_age2*((AGE-40)^2)+par_age3*((AGE-40)^3)]
  
  d[, lnhwage := 2.6+Xb+wfe+ffe+yfe+u]
  d[, hyearmean := mean(lnhwage, na.rm=TRUE), by = year]
  d[, loghourly := lnhwage - hyearmean]
  
  d
}

simcomp <- function(p=6, startyear=2002, saverep = saverepdt,
                  estab=FALSE, year_method="yearlymean", agemin=16, agemax=70, dureemin=360, 
                  sizemin=1, assoc=TRUE, ...){
  
  # We start with a classic AKM estimation on a panel
  if (year_method=="yearlymean"){
    metho <- "yearly mean"
    yname <- "loghourly"
  }
  if (year_method=="yearFE"){
    metho <- "Card"
    yname <- "lnhwage"
  }
  
  savn <- paste(saverep,"real_AKM",startyear,startyear+p-1,metho, sep="")
  
  d <- fast_panel(p=p, startyear=startyear, saverep = saverep, estab=estab, cluster=FALSE, 
                  statdes=TRUE, saveopt=TRUE, metho=metho, agemin=agemin, agemax=agemax, 
                  dureemin=dureemin, sizemin=sizemin, assoc=assoc)
  
  r <- estim(d, startyear=startyear, split_metho="none", metho=metho, sizelim=sizemin, p=p,
                         yname=yname, rotat_year=FALSE, saveopt=TRUE, savn=savn, 
                         return_data=TRUE)

  d <- as.data.table(r[[2]])
  results <- r[[1]]
  print(results)
  rm(r)
  gc()
  

  # We generate the simulated counterpart of our data:
  d$ffe <- d$ffe_all
  dsim <- wagesim(d, ...) # simulation parameters can be set in simcomp call, symbolized as "..."
  rm(d)
  gc()
  
  # We compute the ground truth, variance decomposition with the simulated fixed effects:
  dsim$ffe_all <- dsim$ffe
  dsim$wfe_all <- dsim$wfe
  dsim$yfe_all <- dsim$yfe
  dec <- split_fast_decompo(dsim, y_name=yname, method=metho)
  dec$estimethod <- "sim_baseline"
  #results <- rbind(results, dec, fill=T)
  print(dec)
  savn <- paste(saverep,"Sim_groundtruth_",startyear,startyear+p-1,metho,".csv", sep="")
  write.csv(dec, file = savn)
  

  # We then reestimate the model on the simulated wages, corrected and uncorrected:
  dsim[, c("wfe_all","ffe_all","yfe_all"):=NULL] #these column names need to be available for estimation 
  # uncorrected estimation
  savn <- paste(saverep,"sim_AKM",startyear,startyear+p-1,metho, sep="")
  r <- estim(dsim, startyear=startyear, split_metho="none", metho=metho,
             sizelim=sizemin, p=p,yname=yname, rotat_year=FALSE, saveopt=TRUE,
             savn=savn, return_data=TRUE)
  
  dsim <- r[[2]]
  res <- r[[1]]
  rm(r)
  gc()
  #results <- rbind(results, res, fill=T)
  savn <- paste(saverep,"Sim_uncorrectedAKM_",startyear,startyear+p-1,metho,".csv", sep="")
  write.csv(res, file = savn)
  
  #corrected by firm split
  savn <- paste(saverep,"sim_fsplit",startyear,startyear+p-1,metho, sep="")
  r <- estim(dsim, startyear=startyear, split_metho="by_firm_movers", metho=metho,
             sizelim=sizemin, p=p, yname=yname, rotat_year=FALSE, saveopt=TRUE, 
             savn=savn, return_data=TRUE)

  dsim <- r[[2]]  
  res <- r[[1]]
  rm(r)
  gc()
  #results <- rbind(results, res, fill=T)
  savn <- paste(saverep,"Sim_correctedAKM_",startyear,startyear+p-1,metho,".csv", sep="")
  write.csv(res, file = savn)
  
  # Finally we recompute the groundtruth (variance decomposition with exact simulated FE)
  #  on the split firm selection and the AKM selection
  # Restriction to firms belonging to the main connected component
  dsim[, splitsamp := ((!is.na(ffe1))&(!is.na(ffe0)))]
  # Restriction to firms belonging to both main connected components of split firms
  dsim[, akmsamp := !is.na(ffe_all)]
  
  dsim$ffe_all <- dsim$ffe
  dsim$wfe_all <- dsim$wfe
  dsim$yfe_all <- dsim$yfe
  dec <- split_fast_decompo(dsim[splitsamp==TRUE,], y_name=yname, method=metho)
  dec$estimethod <- "sim_baseline_splitcomp"
  #results <- rbind(results, dec, fill=T)
  savn <- paste(saverep,"Sim_baseline_splitcomp_",startyear,startyear+p-1,metho,".csv", sep="")
  write.csv(dec, file = savn)
  
  dec <- split_fast_decompo(dsim[akmsamp==TRUE,], y_name=yname, method=metho)
  dec$estimethod <- "sim_baseline_akmcomp"
  #results <- rbind(results, dec, fill=T)
  savn <- paste(saverep,"Sim_baseline_akmcomp_",startyear,startyear+p-1,metho,".csv", sep="")
  write.csv(dec, file = savn)
  
  savn <- paste(saverep,"All_sims_",startyear,startyear+p-1,metho,".csv", sep="")
  write.csv(results, file = savn)
  
  results
}


###################################
# Firm-size sorting decomposition #
###################################
# We use "estim" function "groupvar" parameter to decompose variance within groups and between groups

size_decomp <- function(startyearlist=c(2002,2014), 
                        split_metho_list=c("none","by_firm_movers"),
                        year_method="yearlymean", saverep=saverepdt, ...){
  results <- list()
  if (year_method=="yearlymean"){
    metho <- "yearly mean"
    yname <- "loghourly"
  }
  if (year_method=="yearFE"){
    metho <- "Card"
    yname <- "lnhwage"
  }
  for (year in startyearlist){
    d <- fast_panel(startyear=year, statdes=FALSE, metho=metho, ...)
    d[,firmsize_cat:=cut(firmsize, breaks=c(0,20,200,1000,Inf))]
    savn <- paste(saverep,"size_decomp",year,metho, sep="")
    for (split_metho in split_metho_list){
      esti <- estim(d, startyear=year, split_metho=split_metho, return_data=TRUE, rotat_year=FALSE,
                    groupvar="firmsize_cat", yname=yname, metho=metho, savn=savn)
      results <- rbind(setDT(results), setDT(esti[[1]]))
    }
  }
  savename <- paste(saverep,"groupdecomp_","firmsizecat_",".csv", sep="")
  write.csv(results, file = savename)
  results
}

firm_network <- function(d){
  # function to build the network of firms linked by workers' mobilities
  
  # first, we select only movers, and only one line per worker/firm couple
  d <- d[,.(indiv_id, firm_id, nbfirms, mover)]
  d <- d[mover==TRUE,]
  d <- unique(d)
  
  g <- merge(d,d,by="indiv_id",allow.cartesian = T)
  g <- unique(g[,c("firm_id.x","firm_id.y")])
  g <- g[firm_id.x < firm_id.y,]
  
  return(g)
}

network_centrality <- function(g){
  # function to compute centrality measures in the mobility network of firms
  # for each firm
  g <- g[,.(firm_id.x,firm_id.y)]
  gr <- graph_from_edgelist(as.matrix(g),directed = F)
  
  uu1 <- eigen_centrality(gr)
  uu2 <- eigen_centrality(gr,scale=F)
  vv <- centr_degree(gr)
  
  name1 <- attr(uu1$vector,"name")
  rr1 <- data.frame(name1,uu1$vector)
  
  name2 <- attr(uu2$vector,"name")
  rr2 <- data.frame(name2,uu2$vector)
  
  name3 <- V(gr)$name
  rr3 <- data.frame(name3,vv$res)
  
  rr <- merge(rr1,rr2,by.x="name1",by.y="name2")
  rr <- merge(rr,rr3,by.x="name1",by.y="name3")
  
  colnames(rr) <- c("firm_id","eig_sc","eig","deg")

  return(rr)
}

centrality_stat <- function(d){
  # function that uses the results of network_centrality and firm_network
  # to compute stat on the centrality of firm, overall and per sizegroup
  d[, nbfirms := (length(unique(firm_id))), by = indiv_id]
  d[, mover := nbfirms>1]
  df <- d[, list(.N, mean(firmsize, na.rm=TRUE), sum(mover), sum(nbfirms-1),
                 max(!is.na(ffe1+ffe0)), max(!is.na(ffe_all))), by=firm_id]
  names(df) <- c("firm_id","nobs","size","nmovers","nconnexions","splitcomps", "akmcomp")
  df[,firmsize_cat:=cut(size, breaks=c(0,20,200,1000,Inf))]
  
  g <- firm_network(d)
  f <- network_centrality(g)

  df <- merge(df, f, by="firm_id", all.x=TRUE)
  
  centrality <- df[, list(.N,sum(nobs, na.rm=T),mean(size, na.rm=T),mean(nmovers, na.rm=T),mean(nconnexions, na.rm=T),
                          mean(eig_sc, na.rm=TRUE), mean(eig, na.rm=TRUE), 
                          mean(deg, na.rm=TRUE)), 
                   by=firmsize_cat]
  centrality_all <- df[, list(.N,sum(nobs, na.rm=T),mean(size, na.rm=T),mean(nmovers, na.rm=T),mean(nconnexions, na.rm=T),
                              mean(eig_sc, na.rm=TRUE), mean(eig, na.rm=TRUE), 
                              mean(deg, na.rm=TRUE))]
  centrality_all$firmsize_cat <- "All firms"
  centrality <- rbind(centrality, centrality_all)
  common_names <- c("firmsize_cat","nfirms","nobs","firmsize","nmovers","nconnexions",
                    "eig_sc","eig","deg")
  names(centrality) <- common_names
  centrality$sample <- "All sample"
  
  
  centrality_akm <- df[akmcomp==1, list(.N,sum(nobs, na.rm=T),mean(size, na.rm=T),mean(nmovers, na.rm=T),mean(nconnexions, na.rm=T),
                                        mean(eig_sc, na.rm=TRUE), mean(eig, na.rm=TRUE), 
                                        mean(deg, na.rm=TRUE)), 
                       by=firmsize_cat]
  centrality_akm_all <- df[akmcomp==1, list(.N,sum(nobs, na.rm=T),mean(size, na.rm=T),mean(nmovers, na.rm=T),mean(nconnexions, na.rm=T),
                                            mean(eig_sc, na.rm=TRUE), mean(eig, na.rm=TRUE), 
                                            mean(deg, na.rm=TRUE))]
  centrality_akm_all$firmsize_cat <- "All firms"
  centrality_akm <- rbind(centrality_akm, centrality_akm_all)
  names(centrality_akm) <- common_names
  centrality_akm$sample <- "AKM"
  
  
  centrality_splits <- df[splitcomps==1, list(.N,sum(nobs, na.rm=T),mean(size, na.rm=T),mean(nmovers, na.rm=T),mean(nconnexions, na.rm=T),
                                              mean(eig_sc, na.rm=TRUE), mean(eig, na.rm=TRUE), 
                                              mean(deg, na.rm=TRUE)),
                          by=firmsize_cat]
  centrality_splits_all <- df[splitcomps==1, list(.N,sum(nobs, na.rm=T),mean(size, na.rm=T),mean(nmovers, na.rm=T),mean(nconnexions, na.rm=T),
                                                  mean(eig_sc, na.rm=TRUE), mean(eig, na.rm=TRUE), 
                                                  mean(deg, na.rm=TRUE))]
  centrality_splits_all$firmsize_cat <- "All firms"
  centrality_splits <- rbind(centrality_splits, centrality_splits_all)
  names(centrality_splits) <- common_names
  centrality_splits$sample <- "Split samples"
  
  c <- rbind(centrality, centrality_akm, centrality_splits)
  
  
  return(c)
}

#########################
# BLM clusters dataprep #
#########################

build_panel <- function(startyear, endyear, minsize=1, minage=16, maxage=70, dureemin=360){
  tic("building panel")
  panel <- list()
  p <- endyear - startyear +1
  for (y in 1:p){
    print(paste("reading file for year ", y-1+startyear))
    panel[[y]] <- read.fst(paste(fstrep, y-1+startyear, ".fst", sep=""))
  }
  print("loading done, binding years in panel")
  d <- rbindlist(panel)
  rm(panel)
  # age
  d <- d[AGE<=maxage,]
  d <- d[AGE>=minage,]
  d <- d[DUREE>=dureemin,]
  d[, c("age_2","age_3") := list((AGE-40)^2,(AGE-40)^3)]
  
  d[, firmsize := uniqueN(indiv_id), by=firm_id]
  d <- d[d$firmsize>=minsize,]
  
  toc()
  
  d
}

year2_reduce <- function(panel){
  tic("panel prep for clustering")
  panel <- panel[,c("indiv_id","firm_id","year","loghourly")]
  panel[,f:=paste("f",firm_id, sep="")]
  panel <- panel[order(year)]
  panel[, min_year:=min(year), by=f]
  panel <- panel[(year<min_year+2)]
  panel[, nbfirms := (length(unique(f))), by = indiv_id]
  
  tic("movers file")
  movers <- panel[nbfirms>1]
  movers <- unique(movers, by=c("indiv_id","firm_id"))
  movers[, n_order := rowid(indiv_id)]
  movers <- movers[n_order<3]
  m1 <- dcast(movers, indiv_id ~ n_order, value.var = "f")
  m2 <- dcast(movers, indiv_id ~ n_order, value.var = "loghourly")
  movers <- cbind(m1, m2[,2:3])
  names(movers) <- c("indiv_id","f1","f2","y1","y2")
  movers <- na.omit(movers)
  toc()
  
  tic("stayers file")
  stayers <- panel[nbfirms==1]
  stayers[, n_order := rowid(indiv_id)]
  stayers <- stayers[n_order<3]
  s1 <- dcast(stayers, indiv_id ~ n_order, value.var = "f")
  s2 <- dcast(stayers, indiv_id ~ n_order, value.var = "loghourly")
  stayers <- cbind(s1, s2[,2:3])
  names(stayers) <- c("indiv_id","f1","f2","y1","y2")
  stayers <- na.omit(stayers)
  toc()
  toc()
  
  return(list(movers, stayers))
}

